####################################################################################

#Detecting Misinformation: Identifying False News Spread by Political Leaders in the Global South
#This file creates the politician level analysis for Table 2, Figures 1(b), 7 and 8 and data cited in 
#the paper related to politician-level analyses. It also includes several online appendix analyses.
#See readme.txt for replication instructions

####################################################################################
# Start from an empty workspace so objects left over from an earlier session
# cannot leak into the replication.
rm(list=ls())

options(scipen=999) # suppressing scientific notation

# Packages used by this script. MASS is attached after tidyverse and masks
# dplyr::select(), which is why select() is called as dplyr::select() below.
pkgs <- c("tidyverse", "estimatr", "xtable", "readxl",
          "MASS", "sampleSelection", "rockchalk", "modelsummary", 
          "texreg", "broom")

# library() stops with an error when a package is missing; require() only
# returns FALSE, which let the script carry on and fail later at the first
# call into the missing package.
invisible(lapply(pkgs, library, character.only = TRUE))

#load politician level dataset

# Min-max rescaling of a numeric vector to [0, 1].
#   x      numeric vector
#   na.rm  drop NAs when finding the range? (NAs in x always stay NA)
# With na.rm = FALSE a single NA makes the whole result NA, exactly like the
# bare min()/max() calls this helper replaces.
rescale01 <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}

pol <- read_rds("Data/final-politician-dat.rds") %>%
  #Change reference group for variables and normalizing variables
  mutate(elec_coalition1 = relevel(as.factor(elec_coalition), ref = "other"),
         # case_when() takes the first matching branch, so the upper bounds
         # are enough; written this way no value between two bins (e.g. 44.5)
         # can fall through to NA.
         age1 = case_when(age < 30 ~ "22-29", 
                          age < 45 ~ "30-44", 
                          age < 65 ~ "45-64", 
                          age >= 65 ~ "65+"), 
         age2 = relevel(as.factor(age1), ref = "22-29"), 
         # ideology is the only rescaled variable whose range ignores NAs
         ideo_alt_r = rescale01(ideo_alt, na.rm = TRUE),
         # NOTE(review): log() gives -Inf for a politician with zero posts,
         # which would turn posts_logged_r into NaN - assumes total_posts > 0.
         posts_logged = log(total_posts), 
         posts_logged_r = rescale01(posts_logged), 
         total_fake_posts_r = rescale01(total_fake_posts), 
         total_fake_posts_url_r = rescale01(total_fake_posts_url),
         total_fake_posts_gdi_r = rescale01(total_fake_posts_gdi))

#due to data sharing agreements we are unable to share post-level data

#========================================================================
#Figure 1: Overlap in politicians identified as sharing false content by detection approach
#rows: text approach flag; columns: url approach flag
table(pol[["fake_dummy"]], pol[["fake_dummy_url"]])

#========================================================================
#Table 2: Prevalence of Politicians Sharing False Content and Posts Containing False Content by Detection Approach.

# Row labels shared by every column of the table. The last row ("n total")
# holds the denominators: number of politicians and number of posts.
approach_labels <- c("Text", "Domain", "Facebook URL", "n total")

#------------------------
#column 3
#% of politicians with at least one flagged post under each approach.
# mean() of a logical vector is the share of TRUE values, so the result does
# not depend on the order (or presence) of the levels returned by table().
# NAs are dropped, as prop.table(table(x)) does.
pct <- data.frame(
  DetectionApproach = approach_labels,
  Pct.Pol. = c(mean(pol$fake_dummy > 0, na.rm = TRUE) * 100,     #text-based approach
               mean(pol$fake_dummy_url > 0, na.rm = TRUE) * 100, #url (domain) approach
               mean(pol$full_fb_url > 0, na.rm = TRUE) * 100,    #SS1 full URL
               nrow(pol)),
  stringsAsFactors = FALSE)

#------------------------
#column 2
#total posts identified as sharing misinfo by approach
text <- sum(pol$total_fake_posts)
domain <- sum(pol$total_fake_posts_url)
ss1 <- sum(pol$full_fb_url) 

# as.numeric() keeps the column a double so xtable(digits = 4) formats it the
# same way as the other columns even if the counts are stored as integers.
total_false <- data.frame(
  DetectionApproach = approach_labels,
  Total.Identified = as.numeric(c(text, domain, ss1, NA)),
  stringsAsFactors = FALSE)

#------------------------
#column 4
#% of posts containing misinformation
total <- sum(pol$total_posts)

pct_posts <- data.frame(
  DetectionApproach = approach_labels,
  Pct.Posts = c((text/total)*100, (domain/total)*100, (ss1/total)*100,
                total),
  stringsAsFactors = FALSE)

#combining into full table

# NOTE(review): `sums` is not referenced again in this section; kept unchanged
# in case other code relies on it.
sums <- t(as.data.frame(c(NA, NA, nrow(pol), NA, total))) #total number

# Only the value columns of the second and third pieces are appended, so the
# label column appears once under its own name instead of the
# "DetectionApproach...1" that name repair produces for duplicated columns.
table2 <- bind_cols(total_false,
                    pct["Pct.Pol."],
                    pct_posts["Pct.Posts"])

#final table 2
table2 <- xtable(table2, digits = 4)
print(table2, file = "Tables/table2.tex")

#=============================================================================
# Figure 7 - OLS with robust standard errors
# Binary outcomes: whether a politician has any post flagged by each approach.
# Position enters through lm_robust()'s `fixed_effects` argument.

#dummy variable for text-based approach
m1_d <- lm_robust(fake_dummy ~ sex_imputed + age2 + educ + elec_coalition1 + ideo_alt_r + pol_exp + posts_logged_r, 
                  fixed_effects = ~position,
                  data = pol)
#dummy variable for domain-based approach
m2_d <- lm_robust(fake_dummy_url ~ sex_imputed + age2 + educ + elec_coalition1 + ideo_alt_r + pol_exp + posts_logged_r, 
                  fixed_effects = ~position,
                  data = pol)

# coefficient plot

# Variable that each plotted coefficient must belong to, in plotting order
# (rows 1-9 of tidy(); posts_logged_r in row 10 is not plotted).
fig7_terms <- c("sex_imputed", "age2", "age2", "age2", "educ",
                "elec_coalition1", "elec_coalition1", "ideo_alt_r", "pol_exp")

# Pull the plotted coefficients out of `model` and attach their x position
# (`type`) and the model label (`lbl`). Rows are taken by position, so the
# term names are checked first: if a predictor gained or lost a level the
# axis labels would silently point at the wrong coefficients.
fig7_coefs <- function(model, x_pos, model_label) {
  est <- tidy(model)[seq_along(fig7_terms), ]
  terms_found <- as.character(est$term)
  if (anyNA(terms_found) || !all(startsWith(terms_found, fig7_terms))) {
    stop("Figure 7: unexpected coefficient order: ",
         paste(terms_found, collapse = ", "), call. = FALSE)
  }
  data.frame(est, type = x_pos, lbl = model_label)
}

# The two models sit 0.2 apart within each coefficient and coefficients are
# 0.6 apart; `brks` below puts the axis label midway between the pair.
plot_df <- rbind.data.frame(
  fig7_coefs(m1_d, c(.2, .8, 1.4, 2.0, 2.6, 3.2, 3.8, 4.4, 5.0),
             "Text-Based Approach (Binary)"),
  fig7_coefs(m2_d, c(.4, 1.0, 1.6, 2.2, 2.8, 3.4, 4.0, 4.6, 5.2),
             "Domain-Based Approach (Binary)"),
  stringsAsFactors = FALSE)
plot_df <- plot_df[order(plot_df$type), ]

brks <- c(.3, .9, 1.5, 2.1, 2.7, 3.3, 3.9, 4.5, 5.1)
# NOTE(review): the regression tables for these models label the second
# coalition coefficient "Lula" while this axis says "Haddad" - confirm which
# is intended.
lbl <- c("Sex\n(Male = 1)", "Age\n(30-44)", "Age\n(45-64)", "Age\n(65+)", "Education\n(College = 1)",
         "Electoral alignment\n(Bolsonaro = 1)", "Electoral alignment\n(Haddad = 1)", "Ideology",
         "Political experience")

# `.data$lbl` makes explicit that the colour comes from the model-label column
# of plot_df and not from the axis-label vector `lbl` defined above.
# Thick bar: +/- 1.65 SE; thin whiskers: +/- 1.96 SE.
fig7 <- ggplot(data = plot_df, aes(x = type, y = estimate,
                                   color = .data$lbl)) +
  geom_linerange(aes(
    ymin = estimate - 1.65 * std.error,
    ymax = estimate + 1.65 * std.error),
    lwd = 1.25, show.legend = FALSE) +
  geom_errorbar(aes(
    ymin = estimate - 1.96 * std.error,
    ymax = estimate + 1.96 * std.error),
    width = .25, show.legend = FALSE) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_point(size = 2.5) +
  scale_x_continuous(expression("Coefficient"),
                     breaks = brks, labels = lbl) +
  ylab("Estimate") +
  scale_color_manual("Model",
                     values = c("black", "grey70")) +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "bottom")
fig7

#save figures
# the plot is passed explicitly so the file never depends on which plot
# happens to be the most recently created one
ggsave("Figures/fig7.pdf", plot = fig7, width = 9, height = 8, units = "in")

#=====================================================================================
# Table for Figure 8 - Quasi-poisson
# Count outcomes: number of posts flagged by each approach. Position enters
# as a set of dummies via as.factor(position).

#count variable for text-based approach
m1_c <- glm(total_fake_posts ~ sex_imputed + age2 + educ + elec_coalition1 + ideo_alt_r + pol_exp + posts_logged_r +as.factor(position), 
            family="quasipoisson", data=pol)

#count variable for domain-based approach
m2_c <- glm(total_fake_posts_url ~ sex_imputed + age2 + educ + elec_coalition1 + ideo_alt_r + pol_exp + posts_logged_r +as.factor(position), 
            family="quasipoisson", data=pol)

#=====================================
# coefficient plot for Figure 8
#=====================================

# Variable that each plotted coefficient must belong to, in plotting order.
# These are rows 2-10 of tidy(): row 1 is the intercept, and posts_logged_r
# and the position dummies that follow are not plotted.
fig8_terms <- c("sex_imputed", "age2", "age2", "age2", "educ",
                "elec_coalition1", "elec_coalition1", "ideo_alt_r", "pol_exp")

# Pull the plotted coefficients out of `model` and attach their x position
# (`type`) and the model label (`lbl`). Rows are taken by position, so the
# term names are checked first: if a predictor gained or lost a level the
# axis labels would silently point at the wrong coefficients.
fig8_coefs <- function(model, x_pos, model_label) {
  est <- tidy(model)[1 + seq_along(fig8_terms), ]
  terms_found <- as.character(est$term)
  if (anyNA(terms_found) || !all(startsWith(terms_found, fig8_terms))) {
    stop("Figure 8: unexpected coefficient order: ",
         paste(terms_found, collapse = ", "), call. = FALSE)
  }
  data.frame(est, type = x_pos, lbl = model_label)
}

#make plot data
# The two models sit 0.2 apart within each coefficient and coefficients are
# 0.6 apart; `brks` below puts the axis label midway between the pair.
plot_df <- rbind.data.frame(
  fig8_coefs(m1_c, c(.2, .8, 1.4, 2.0, 2.6, 3.2, 3.8, 4.4, 5.0),
             "Text-Based Approach (Count)"),
  fig8_coefs(m2_c, c(.4, 1.0, 1.6, 2.2, 2.8, 3.4, 4.0, 4.6, 5.2),
             "Domain-Based Approach (Count)"),
  stringsAsFactors = FALSE)
plot_df <- plot_df[order(plot_df$type), ]

#breaks and labels for coefficient plot
brks <- c(.3, .9, 1.5, 2.1, 2.7, 3.3, 3.9, 4.5, 5.1)
# NOTE(review): the regression tables for these models label the second
# coalition coefficient "Lula" while this axis says "Haddad" - confirm which
# is intended.
lbl <- c("Sex\n(Male = 1)", "Age\n(30-44)", "Age\n(45-64)", "Age\n(65+)", "Education\n(College = 1)",
         "Electoral alignment\n(Bolsonaro = 1)", "Electoral alignment\n(Haddad = 1)", "Ideology",
         "Political experience")

#=====================================
#coefficient plot for Figure 8
#=====================================
# `.data$lbl` makes explicit that the colour comes from the model-label column
# of plot_df and not from the axis-label vector `lbl` defined above.
# Thick bar: +/- 1.65 SE; thin whiskers: +/- 1.96 SE.
fig8 <- ggplot(data = plot_df, aes(x = type, y = estimate,
                                   color = .data$lbl)) +
  geom_linerange(aes(
    ymin = estimate - 1.65 * std.error,
    ymax = estimate + 1.65 * std.error),
    lwd = 1.25, show.legend = FALSE) +
  geom_errorbar(aes(
    ymin = estimate - 1.96 * std.error,
    ymax = estimate + 1.96 * std.error),
    width = .25, show.legend = FALSE) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_point(size = 2.5) +
  scale_x_continuous(expression("Coefficient"),
                     breaks = brks, labels = lbl) +
  ylab("Estimate") + 
  scale_color_manual("Model",
                     values = c("black", "grey70")) +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "bottom")
fig8

#save figures
# the plot is passed explicitly so the file never depends on which plot
# happens to be the most recently created one
ggsave("Figures/fig8.pdf", plot = fig8, width = 8, height = 8, units = "in")

#======================================================================
#======================================================================
      ################## Data cited in paper ################## 
#======================================================================
#======================================================================

# Each quoted passage from the paper is followed by the expressions that
# print the numbers it cites.

#to detect the presence of misinformation in about 4 million posts from Facebook, Twitter, and 
#Instagram by 945 politicians in Brazil between 2018 and 2020.
n_posts <- sum(pol$total_posts)
n_posts #4032907
nrow(pol) #945

#Our results indicate that political leaders rarely share misinformation. Each of
#the three approaches—text-based, domain-based, and Facebook-URL—find that less than
#1% of all posts contain misinformation.

# each line tests "more than 1%", so the claim holds when it prints FALSE
(sum(pol$total_fake_posts) / n_posts) * 100 > 1 #text approach
(sum(pol$total_fake_posts_url) / n_posts) * 100 > 1 #domain approach
(sum(pol$full_fb_url) / n_posts) * 100 > 1 #full URL approach

#Yet, we find significant variation across these three detection approaches examined.
#The number of posts containing misinformation can range from 50 to 38,695, and the percentage
#of politicians who have shared misinformation can also range from 1.9% (18) to
#50.5% (478).

sum(pol$full_fb_url) #50
sum(pol$total_fake_posts_url) #38695

# share, then count, of politicians with at least one flagged post
fburl_flagged <- table(pol$full_fb_url > 0)
fburl_flagged / nrow(pol) #full URL approach
fburl_flagged #full URL approach

domain_flagged <- table(pol$total_fake_posts_url > 0)
domain_flagged / nrow(pol) #domain approach
domain_flagged #domain approach

#With the Facebook URL approaches, we find even greater rates
#of mismatch: at least 91.8% (134) of the politicians identified in the text-based approach
#are not identified in Facebook-URL approaches, and 33.3% (6) of the politicians identified
#in the URL approach were not identified in the text-based approach.

# rows: text-based flag; columns: Facebook-URL flag
text_by_fburl <- table(pol$fake_dummy, pol$full_fb_url > 0)

# as a share of the politicians flagged by the text-based approach
text_by_fburl / sum(pol$fake_dummy) #91.8%
text_by_fburl #134

# as a share of the politicians flagged by the Facebook-URL approach
text_by_fburl / sum(pol$full_fb_url > 0) #33.3%
text_by_fburl #6


#======================================================================
#======================================================================
      ################## Online Appendices ################## 
#======================================================================
#======================================================================

##=======
#Tables for Figures 7 and 8
##=======

# Row labels for the predictors, in the order the coefficients are reported.
predictor_labels <- c("Sex (Male = 1)",
                      "Age (30-44)",
                      "Age (45-64)",
                      "Age (65+)",
                      "Education (College =1 )",
                      "Electoral alignment (Bolsonaro = 1)",
                      "Electoral alignment (Lula = 1)",
                      "Ideology",
                      "Political Experience",
                      "Number of posts (logged)")

# Row labels for the position dummies that appear in the count models.
position_labels <- c("Position = Federal Deputy",
                     "Position = Governor",
                     "Position = Minister",
                     "Position = President",
                     "Position = Senator",
                     "Position = Vice-Governor")

#regression table for results - Table 12
#Table for Figure 7 - Predictors of Sharing Misinformation (Binary) by Different Detection Approaches
table12 <- texreg(list(m1_d, m2_d),
                  include.ci = FALSE,
                  custom.coef.names = predictor_labels)
print(table12, file = "Tables/Table-for-Figure7.tex")

#regression table for results - Table 13
#Table for Figure 8 - Predictors of Sharing Misinformation (Count) by Different Detection Approaches
# the count models report an intercept first and the position dummies last
table13 <- texreg(list(m1_c, m2_c),
                  include.ci = FALSE,
                  custom.coef.names = c("(Intercept)",
                                        predictor_labels,
                                        position_labels))
print(table13, file = "Tables/Table-for-Figure8.tex")

##=======
#Additional Analyses for Operationalizations of Ideology
##=======

# Formula for a binary-outcome model in which the ideology measure is
# `ideology_var`; the remaining predictors are fixed.
ideology_formula <- function(outcome, ideology_var) {
  reformulate(c("sex_imputed", "age2", "educ", "elec_coalition1",
                ideology_var, "pol_exp", "posts_logged_r"),
              response = outcome)
}

# Row labels for the tables; `ideology_labels` fills the slot of the ideology
# coefficient(s).
ideology_coef_names <- function(ideology_labels) {
  c("Sex (Male = 1)", "Age (30-44)", "Age (45-64)", "Age (65+)",
    "Education (College =1 )",
    "Electoral alignment (Bolsonaro = 1)",
    "Electoral alignment (Lula = 1)",
    ideology_labels,
    "Political Experience",
    "Number of posts (logged)")
}

#Ideology by the median 
m1_median <- lm_robust(ideology_formula("fake_dummy", "ideology_median"),
                       fixed_effects = ~position,
                       data = pol)
m2_median <- lm_robust(ideology_formula("fake_dummy_url", "ideology_median"),
                       fixed_effects = ~position,
                       data = pol)

median_table <- texreg(list(m1_median, m2_median),
                       custom.coef.names = ideology_coef_names("Ideology (Right-Wing = 1)"),
                       include.ci = FALSE)
print(median_table, file = "Tables/Table-Fig1-ideologymedian.tex")

#Ideology by quantile 90%-10%
m1_extreme <- lm_robust(ideology_formula("fake_dummy", "ideology_extreme"),
                        fixed_effects = ~position,
                        data = pol)
m2_extreme <- lm_robust(ideology_formula("fake_dummy_url", "ideology_extreme"),
                        fixed_effects = ~position,
                        data = pol)

extreme_table <- texreg(list(m1_extreme, m2_extreme),
                        include.ci = FALSE,
                        custom.coef.names = ideology_coef_names("Ideology (Not extreme = 1)"))
print(extreme_table, file = "Tables/Table-Fig1-ideologyextreme.tex")

#Ideology by left-right extreme
# this measure contributes two coefficients, hence two ideology labels
m1_extreme_levels <- lm_robust(ideology_formula("fake_dummy", "ideology_extreme_levels"),
                               fixed_effects = ~position,
                               data = pol)
m2_extreme_levels <- lm_robust(ideology_formula("fake_dummy_url", "ideology_extreme_levels"),
                               fixed_effects = ~position,
                               data = pol)

extreme_levels_table <- texreg(
  list(m1_extreme_levels, m2_extreme_levels),
  include.ci = FALSE,
  custom.coef.names = ideology_coef_names(c("Ideology (Extreme Right = 1)",
                                            "Ideology (Not Extreme = 1)")))
print(extreme_levels_table, file = "Tables/Table-Fig1-ideologyextremelevel.tex")

###=======
#Identification of URLs (``repeat offenders'')
###=======
#Figure 29: Number of politicians flagged as sharing misinformation using the ``Repeat Offenders" URL 
#approach, the Text-Based approach and both approaches.
# rows: text-based flag; columns: repeat-offender flag
table(pol[["fake_dummy"]], pol[["fake_dummy_url_repeat_repeat_offender"]])

#Results with repeat offenders and Global Disinformation Index (GDI) list

# Predictors shared by every model in this section.
ro_predictors <- c("sex_imputed", "age2", "educ", "elec_coalition1",
                   "ideo_alt_r", "pol_exp", "posts_logged_r")

# Binary models take position through `fixed_effects`; count models take it
# as an extra as.factor(position) term.
ro_binary_formula <- function(outcome) {
  reformulate(ro_predictors, response = outcome)
}
ro_count_formula <- function(outcome) {
  reformulate(c(ro_predictors, "as.factor(position)"), response = outcome)
}

ro_predictor_labels <- c("Sex (Male = 1)", "Age (30-44)",
                         "Age (45-64)", "Age (65+)", "Education (College =1 )",
                         "Electoral alignment (Bolsonaro = 1)",
                         "Electoral alignment (Lula = 1)",
                         "Ideology",
                         "Political Experience",
                         "Number of posts (logged)")
ro_position_labels <- c("Position = Federal Deputy",
                        "Position = Governor",
                        "Position = Minister",
                        "Position = President",
                        "Position = Senator",
                        "Position = Vice-Governor")

# Binary: Table 17 - Table for Figure 7 - Predictors of Sharing Misinformation (Binary) by Different Detection Approaches

# text-based, domain-based, repeat-offender and GDI outcomes, in that order
m1_d <- lm_robust(ro_binary_formula("fake_dummy"),
                  fixed_effects = ~position,
                  data = pol)
m2_d <- lm_robust(ro_binary_formula("fake_dummy_url"),
                  fixed_effects = ~position,
                  data = pol)
m3_d <- lm_robust(ro_binary_formula("fake_dummy_url_repeat_repeat_offender"),
                  fixed_effects = ~position,
                  data = pol)
m4_d <- lm_robust(ro_binary_formula("fake_dummy_gdi"),
                  fixed_effects = ~position,
                  data = pol)

table17 <- texreg(list(m1_d, m2_d, m3_d, m4_d),
                  include.ci = FALSE,
                  custom.coef.names = ro_predictor_labels)
print(table17, file = "Tables/Table-Fig7-Repeated-Offenders.tex")

# Count - Table 18 - Table for Figure 8 - Predictors of Sharing Misinformation (Count) by Different Detection Approaches

m1_c <- glm(ro_count_formula("total_fake_posts"),
            family = "quasipoisson", data = pol)
m2_c <- glm(ro_count_formula("total_fake_posts_url"),
            family = "quasipoisson", data = pol)
m3_c <- glm(ro_count_formula("total_fake_posts_url_repeat_offender"),
            family = "quasipoisson", data = pol)
m4_c <- glm(ro_count_formula("total_fake_posts_gdi"),
            family = "quasipoisson", data = pol)

table18 <- texreg(list(m1_c, m2_c, m3_c, m4_c),
                  include.ci = FALSE,
                  custom.coef.names = c("(Intercept)",
                                        ro_predictor_labels,
                                        ro_position_labels))
print(table18, file = "Tables/Table-Fig8-Repeated-Offenders.tex")

###=======
#Additional Binary Outcome Model Specifications
###=======

# Each binary outcome is fitted three ways: OLS with robust standard errors
# and position fixed effects, then probit and logit with position dummies.
alt_binary_predictors <- c("sex_imputed", "age2", "educ", "elec_coalition1",
                           "ideo_alt_r", "pol_exp", "posts_logged_r")

# Rows of the combined table: the OLS coefficients come first, followed by
# the intercept and position dummies that only the probit/logit fits report.
alt_binary_labels <- c("Sex (Male = 1)", "Age (30-44)",
                       "Age (45-64)", "Age (65+)", "Education (College = 1)",
                       "Electoral alignment (Bolsonaro = 1)",
                       "Electoral alignment (Lula = 1)",
                       "Ideology",
                       "Political Experience",
                       "Number of posts (logged)",
                       "(Intercept)",
                       "Position = Federal Deputy",
                       "Position = Governor",
                       "Position = Minister",
                       "Position = President",
                       "Position = Senator",
                       "Position = Vice-Governor")

###text-based###

text_ols_formula <- reformulate(alt_binary_predictors,
                                response = "fake_dummy")
text_glm_formula <- reformulate(c(alt_binary_predictors, "as.factor(position)"),
                                response = "fake_dummy")

#OLS
m1_d <- lm_robust(text_ols_formula,
                  fixed_effects = ~position,
                  data = pol)

#probit
m1_d_probit <- glm(text_glm_formula,
                   family = binomial(link = "probit"), data = pol)

#logit
m1_d_logit <- glm(text_glm_formula,
                  family = binomial(link = "logit"), data = pol)

#table output
text_alt_table <- texreg(list(m1_d, m1_d_probit, m1_d_logit),
                         include.ci = FALSE,
                         custom.coef.names = alt_binary_labels)
print(text_alt_table, file = "Tables/Table-for-fig7-alt-models-text.tex")

###domain-based###

domain_ols_formula <- reformulate(alt_binary_predictors,
                                  response = "fake_dummy_url")
domain_glm_formula <- reformulate(c(alt_binary_predictors, "as.factor(position)"),
                                  response = "fake_dummy_url")

#OLS
m2_d <- lm_robust(domain_ols_formula,
                  fixed_effects = ~position,
                  data = pol)

#probit
m2_d_probit <- glm(domain_glm_formula,
                   family = binomial(link = "probit"), data = pol)

#logit
m2_d_logit <- glm(domain_glm_formula,
                  family = binomial(link = "logit"), data = pol)

#table output
domain_alt_table <- texreg(list(m2_d, m2_d_probit, m2_d_logit),
                           include.ci = FALSE,
                           custom.coef.names = alt_binary_labels)
print(domain_alt_table, file = "Tables/Table-for-fig7-alt-models-domain.tex")

###=======
#Additional Count Outcome Model Specifications
###=======

# Each count outcome is fitted three ways on the same right-hand side:
# quasi-Poisson, Poisson and negative binomial.
alt_count_terms <- c("sex_imputed", "age2", "educ", "elec_coalition1",
                     "ideo_alt_r", "pol_exp", "posts_logged_r",
                     "as.factor(position)")

alt_count_labels <- c("(Intercept)", "Sex (Male = 1)", "Age (30-44)",
                      "Age (45-64)", "Age (65+)", "Education (College =1 )",
                      "Electoral alignment (Bolsonaro = 1)",
                      "Electoral alignment (Lula = 1)",
                      "Ideology",
                      "Political Experience",
                      "Number of posts (logged)",
                      "Position = Federal Deputy",
                      "Position = Governor",
                      "Position = Minister",
                      "Position = President",
                      "Position = Senator",
                      "Position = Vice-Governor")

###text based###

text_count_formula <- reformulate(alt_count_terms,
                                  response = "total_fake_posts")

#quasipoisson
m1_c <- glm(text_count_formula, family = "quasipoisson", data = pol)

#poisson
m1_c_poisson <- glm(text_count_formula, family = "poisson", data = pol)

#negative binomial
m1_c_nb <- glm.nb(text_count_formula, data = pol)

#table
text_count_table <- texreg(list(m1_c, m1_c_poisson, m1_c_nb),
                           include.ci = FALSE,
                           custom.coef.names = alt_count_labels)
print(text_count_table, file = "Tables/Table-for-fig8-alt-models-text.tex")

###domain based###

domain_count_formula <- reformulate(alt_count_terms,
                                    response = "total_fake_posts_url")

#quasipoisson
m2_c <- glm(domain_count_formula, family = "quasipoisson", data = pol)

#poisson
m2_c_poisson <- glm(domain_count_formula, family = "poisson", data = pol)

#negative binomial
m2_c_nb <- glm.nb(domain_count_formula, data = pol)

#table
domain_count_table <- texreg(list(m2_c, m2_c_poisson, m2_c_nb),
                             include.ci = FALSE,
                             custom.coef.names = alt_count_labels)
print(domain_count_table, file = "Tables/Table-for-fig8-alt-models-domain.tex")

###=======
#Two-Step Heckman Model
###=======

# Selection indicators: TRUE when the politician has at least one post
# flagged by the text-based / domain-based approach.
pol$observed_index_fake_dummy <- pol$fake_dummy > 0
pol$observed_index_fake_dummy_url <- pol$fake_dummy_url > 0

# The Heckman models are estimated without the president and vice-president.
# NOTE(review): presumably because these offices have too few holders for
# their position dummies to be estimated - confirm.
excluded_positions <- c("presidente", "vice-presidente")
pol1 <- pol %>% filter(!(position %in% excluded_positions))

# Selection and outcome equations use the same set of regressors.

#text-based approach
m1_h <- heckit(
  selection = observed_index_fake_dummy ~ sex_imputed + age2 + educ + elec_coalition1 + ideo_alt_r + pol_exp + posts_logged_r + as.factor(position),
  outcome = total_fake_posts ~ sex_imputed + age2 + educ + elec_coalition1 + ideo_alt_r + pol_exp + posts_logged_r + as.factor(position),
  data = pol1,
  method = '2step')

#domain-based approach
m2_h <- heckit(
  selection = observed_index_fake_dummy_url ~ sex_imputed + age2 + educ + elec_coalition1 + ideo_alt_r + pol_exp + posts_logged_r + as.factor(position),
  outcome = total_fake_posts_url ~ sex_imputed + age2 + educ + elec_coalition1 + ideo_alt_r + pol_exp + posts_logged_r + as.factor(position),
  data = pol1,
  method = '2step')

heckman_table <- texreg(list(m1_h, m2_h), include.ci = FALSE)
print(heckman_table, file = "Tables/Table-for-Figure8-twostep.tex")

###=======
#Cross Tabs for Binary Outcomes by Predictor
##=======

# Row-percentage cross tabs of every politician-level predictor (rows) against
# each binary false-story indicator (columns); one LaTeX file per pairing,
# written as Tables/cross-tab-<outcome>-<predictor>.tex.
#
# pctable() reads its formula as row_variable ~ column_variable. The predictor
# is on the left-hand side, so the outcome label must go in cvlab; passing it
# as rvlab would attach "False Stories" to the predictor rows.

# File-name suffix -> predictor column in `pol`.
crosstab_predictors <- c(
  coalition  = "elec_coalition",
  sex        = "sex_imputed",
  age        = "age2",
  educ       = "educ",
  ideology   = "ideology_median",
  experience = "pol_exp",
  office     = "position"
)

# File-name stub -> outcome column and its display label.
crosstab_outcomes <- list(
  text   = list(column = "fake_dummy",     label = "False Stories (Text)"),
  domain = list(column = "fake_dummy_url", label = "False Stories (domain)")
)

for (outcome_key in names(crosstab_outcomes)) {
  outcome <- crosstab_outcomes[[outcome_key]]

  for (predictor_key in names(crosstab_predictors)) {
    # Builds e.g. elec_coalition ~ fake_dummy (predictor rows, outcome columns).
    crosstab_formula <- reformulate(
      outcome$column,
      response = crosstab_predictors[[predictor_key]]
    )

    tab <- pctable(crosstab_formula, data = pol, cvlab = outcome$label)
    tabsum <- summary(tab, rowpct = TRUE, colpct = FALSE)

    crosstab_file <- file.path(
      "Tables",
      paste0("cross-tab-", outcome_key, "-", predictor_key, ".tex")
    )
    print(xtable(tabsum), file = crosstab_file)
  }
}

###========
#Means for Count Outcomes by Predictor
###========
# Summary tables of the two false-story counts, split by one predictor at a
# time; each table is written to Tables/means-count-<stub>.tex.

# File-name stub -> grouping column in `pol`.
means_groupings <- c(
  coalition  = "elec_coalition",
  sex        = "sex_imputed",
  age        = "age2",
  educ       = "educ",
  ideology   = "ideology_median",
  experience = "pol_exp",
  position   = "position"
)

# Display labels handed to datasummary_balance() for the two count columns.
# NOTE(review): coef_rename is not one of datasummary_balance()'s named
# arguments, so it is forwarded through `...`; confirm that the rows in the
# generated .tex files are actually relabelled.
count_labels <- c("total_fake_posts" = "Count (Text)",
                  "total_fake_posts_url" = "Count (Domain)")

for (file_stub in names(means_groupings)) {
  group_var <- means_groupings[[file_stub]]

  # Keep only the two counts and the grouping column; dplyr:: is spelled out
  # because MASS is loaded after tidyverse and also exports select().
  tmp <- dplyr::select(pol, total_fake_posts, total_fake_posts_url,
                       dplyr::all_of(group_var))

  # dinm = FALSE leaves out the difference-in-means column.
  datasummary_balance(reformulate(group_var, response = "total_fake_posts"),
                      tmp,
                      coef_rename = count_labels,
                      output = paste0("Tables/means-count-", file_stub, ".tex"),
                      dinm = FALSE)
}

